\documentclass[final]{siamart171218}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}

\setlength{\oddsidemargin}{0.65in}
\setlength{\evensidemargin}{0.65in}

\title{Sketch of the ``alternating SQP'' method for fitting Poisson
  topic models}

\author{Peter Carbonetto\thanks{Dept. of Human Genetics and the Research Computing Center, University of Chicago, Chicago, IL}}

\begin{document}

\maketitle

\section{Problem statement}

Given an $n \times p$ matrix of counts $X$ with entries $x_{ij} \geq
0$, the aim is to fit a Poisson model of the counts,
\begin{align}
p(x) &= \prod_{i=1}^n \prod_{j=1}^p p(x_{ij}) \nonumber \\
     &= \prod_{i=1}^n \prod_{j=1}^p \mathrm{Poisson}(x_{ij}; \lambda_{ij}),
\label{eq:poisson-likelihood}
\end{align}
in which the Poisson rates are given by $\lambda_{ij} = \sum_{k=1}^K
l_{ik} f_{jk}$. The model is determined by a $p \times K$ matrix $F$
with entries $f_{ik} \geq 0$ (the ``factors'') and an $n \times K$
matrix $L$ with entries $l_{ik} \geq 0$ (the ``loadings''). Fitting
$F$ and $L$ is equivalent to non-negative matrix factorization with
the ``beta-divergence'' cost function \cite{lee-2001}. It can also be
used to recover a maximum-likelihood estimate for the latent Dirichlet
allocation (LDA) model \cite{blei-2003}. So fitting this model is
useful for a wide range of applications.

The log-likelihood for the Poisson model is
\begin{equation}
\log p(x \,|\, F, L) \propto
  \sum_{i=1}^n \sum_{j=1}^p x_{ij} \log
  ({\textstyle \sum_{k=1}^K l_{ik} f_{jk}})
    - \sum_{i=1}^n \sum_{j=1}^p \sum_{k=1}^K l_{ik} f_{jk},
\label{eq:poisson-log-likelihood}
\end{equation}
where the constant of proportionality is obtained from factorial terms
in the Poisson densities. Our specific aim is to find a $F$ and $L$
that maximizes the log-likelihood \eqref{eq:poisson-log-likelihood};
that is, we would like to solve
\begin{equation}
\begin{array}{ll}
\mbox{minimize} & \ell(F,L) \equiv -\log p(x \,|\, F, L) \\
\mbox{subject to} & F \geq 0, L \geq 0.
\end{array}
\label{eq:problem}
\end{equation}
In the next section, we derive an efficient approach to doing this.

\section{Block co-ordinate descent strategy}

Our strategy for solving \eqref{eq:problem} is to alternate between
solving for $F$ with $L$ fixed, and solving for $L$ with $F$
fixed. When solving for $F$ with $L$ fixed (and vice versa), the
problem naturally decomposes into a collection of much smaller
subproblems that are much more tractable to solve. All the subproblems
are of the following form:
\begin{equation}
\begin{array}{ll}
\mbox{minimize} & \phi(y; B, w) \\
\mbox{subject to} & \mbox{$y_k \geq 0$ for all $k = 1, \ldots, K$},
\end{array}
\label{eq:subproblem}
\end{equation}
in which the objective function is defined as
\begin{equation}
\phi(y; B, w) =
    - \sum_{i=1}^n w_i \log\big({\textstyle \sum_{k=1}^K b_{ik} y_k}\big)
    + \sum_{i=1}^n \sum_{k=1}^K b_{ik} y_k.
\label{eq:subproblem-objective}
\end{equation}
To see the connection between subproblem \eqref{eq:subproblem} and the
original optimization problem \eqref{eq:problem}, observe that the
negative log-likelihood can be recovered as
\begin{equation}
-\log p(x \,|\, F, L) = \sum_{i=1}^n \phi(l_i; F, x_i),
\end{equation}
where $x_i$ is the $i$th row of $X$ and $l_i$ is the $i$th row of $L$.
Alternatively, it can be recovered as
\begin{equation}
-\log p(x \,|\, F, L) = \sum_{j=1}^p \phi(f_j; L, x_j),
\end{equation}
in which $x_j$ is the $j$th column of $X$, and $f_j$ is the $j$th row
of $F$. Therefore, when $F$ is fixed, each row of $L$ can be
separately optimized by solving a problem of the form
\eqref{eq:subproblem}, and when $L$ is fixed, each row of $F$ can be
separately optimized by solving a problem of the form
\eqref{eq:subproblem}.

Initially this may seem like a sensible strategy, but directly
optimizing \eqref{eq:subproblem} turns out to be difficult to do for
numerical reasons: in practice, the entries of $B$ can be very large
or very small, resulting in solutions $y$ in which all the entries are
either very large or very small. This makes it difficult to devise an
algorithm that will work well for all possible input matrices $B$.

I propose to solve for $y$ indirectly by instead solving
\begin{equation}
\begin{array}{ll}
\mbox{minimize}   & f(t; P, u) \\
\mbox{subject to} & \mbox{$t_k \geq 0$ for all $k = 1, \ldots, K$},
\label{eq:subproblem-modified}
\end{array}
\end{equation}
in which the new objective function is
\begin{equation}
f(t; P, u) =
    - \sum_{i=1}^n u_i \log\big({\textstyle \sum_{k=1}^K p_{ik} t_k}\big)
    + \sum_{k=1}^K t_k,
\end{equation}
where I've defined
\begin{align*}
u_i    &= \frac{w_i}{\sum_{i'=1}^n w_{i'}} \\
p_{ik} &= b_{ik} \times \frac{\sum_{i'=1}^n w_{i'}}{\sum_{i'=1}^n b_{i'k}}.
\end{align*}
After finding the solution $t^{\star}$ to
\eqref{eq:subproblem-modified}, the solution $y^{\star}$ to
\eqref{eq:subproblem} is recovered as
\begin{equation}
y_k^{\star} = t_k^{\star} \times
  \frac{\sum_{i=1}^n w_i}{\sum_{i=1}^n b_{ik}}.
\end{equation}
The main advantage of solving \eqref{eq:subproblem-modified} is that
the solution is numerically well behaved; in particular, the entries
of the solution $t^{\star}$ sum to 1 \cite{kim-2019}. A
straightforward way to appreciate this result is to compare the
complementary slackness condition $\sum_{k=1}^K g_k x_k = 0$ for the
original and modified subproblems, where $g_k$ denotes the partial
derivative of the objective (either $\phi$ or $f$) with respect to the
$k$th co-ordinate (either $y_k$ or $t_k$). Further, since problem
\eqref{eq:subproblem-modified} is obtained by a simple linear
transformation of the variables, any iterate that leads to an
improvement in the solution to the modified subproblem will produce an
improvement in the solution to the original subproblem, and vice
versa.

\section{Karush-Kuhn-Tucker conditions}

To assess the quality of a solution, here I derive the first-order
Karush-Kuhn-Tucker (KKT) conditions for \eqref{eq:problem}. Written
in matrix notation, they are
\begin{align}
\nabla_F \ell^{\star}(F,L,\Omega,\Gamma) &= 0 \label{eq:kkt-1} \\
\nabla_L \ell^{\star}(F,L,\Omega,\Gamma) &= 0 \label{eq:kkt-2} \\
\Omega \odot F &= 0 \label{eq:kkt-3} \\
\Gamma \odot L &= 0 \label{eq:kkt-4}.
\end{align}
Here, I have introduced matrices of Lagrange multipliers $\Omega \geq
0, \Gamma \geq 0$ associated with the non-negativity constraints $F
\geq 0, L \geq 0$, $A \odot B$ is the Hadamard (elementwise) product
of matrices $A$ and $B$, and $\ell^{\star}(F, L, \Omega, \Gamma)$ is the
Lagrangian function:
\begin{equation}
\phi(F, L, \Omega, \Gamma) = \ell(F,L) 
- \sum_{i=1}^n \sum_{k=1}^K \gamma_{ik} l_{ik}
- \sum_{j=1}^m \sum_{k=1}^K \omega_{jk} f_{jk}.
\end{equation}
Applying conditions (\ref{eq:kkt-1}, \ref{eq:kkt-2}), we obtain
\begin{align}
\Omega &= (1 - A)^TL \\ 
\Gamma &= (1 - A)F,
\end{align}
in which $A$ is defined to be the $n \times m$ matrix with entries
$a_{ij} = x_{ij} / \lambda_{ij}$. Next, applying the complementary
conditions (\ref{eq:kkt-3}, \ref{eq:kkt-4}) results in the following
expressions for the residuals of the complementary conditions:
\begin{align}
r_F &= F \odot (1 - A)^TL \\
r_L &= L \odot (1 - A)F.
\end{align}
These residuals should vanish near a minimizer of \eqref{eq:problem}.

\bibliographystyle{siamplain}
\bibliography{altsqp}

\end{document}

